#!pip install ipython-sql   #Requirement already satisfied


# pip install mysqlclient   #Requirement already satisfied


import sqlalchemy


import getpass
import os

# Connect to the local MySQL database `ess10_db` as `root`.
# The password comes from the ESS10_DB_PASSWORD environment variable (with an
# interactive prompt as a fallback) instead of being hard-coded, so the
# credential is not stored in the notebook. URL.create() also escapes special
# characters in the password, which a hand-built URL string does not.
_db_password = os.environ.get("ESS10_DB_PASSWORD") or getpass.getpass(
    "MySQL password for root@localhost: "
)
sqlalchemy.create_engine(
    sqlalchemy.engine.URL.create(
        "mysql",
        username="root",
        password=_db_password,
        host="localhost",
        database="ess10_db",
    )
)

Engine(mysql://root:***@localhost/ess10_db)


%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


# NOTE(review): the root password is written in plain text in this connection
# string; consider supplying it from outside the notebook instead.
%sql mysql://root:Tikslas@localhost/ess10_db


%%sql
show tables

 * mysql://root:***@localhost/ess10_db
1 rows affected.


%%sql
select * from ess10_sql limit 10

 * mysql://root:***@localhost/ess10_db
10 rows affected.


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load the survey data from CSV, using the first column as the row index,
# then preview the first rows.
tyrimas = pd.read_csv("ess.csv", index_col=0)
tyrimas.head()


# (rows, columns) of the full dataset
tyrimas.shape

(37611, 24)


# Column dtypes and non-null counts, to see where values are missing
tyrimas.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37611 entries, 10038 to 27858
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             37611 non-null  object 
 1   gender              37611 non-null  int64  
 2   age                 37319 non-null  float64
 3   health              37611 non-null  int64  
 4   marital             37547 non-null  float64
 5   children            24300 non-null  float64
 6   domicil             37581 non-null  float64
 7   edu_isced           37487 non-null  float64
 8   eduyrs              37276 non-null  float64
 9   activity            37578 non-null  float64
 10  emplrel             33399 non-null  float64
 11  contract            28963 non-null  float64
 12  organisation        33363 non-null  float64
 13  main_source_income  37092 non-null  float64
 14  hinctnta            29380 non-null  float64
 15  hincfel             37452 non-null  float64
 16  father_edu          34659 non-null  float64
 17  emprf14             36431 non-null  float64
 18  occf14b             32863 non-null  float64
 19  mother_edu          32905 non-null  float64
 20  emprm14             36944 non-null  float64
 21  occm14b             22604 non-null  float64
 22  course              37518 non-null  float64
 23  region              37611 non-null  object 
dtypes: float64(20), int64(2), object(2)
memory usage: 7.2+ MB


# Replace the raw survey variable codes with readable column names
# (the frame is modified in place), then preview the result.
tyrimas.rename(
    columns=dict(
        domicil='area',
        hinctnta='income',
        hincfel='feel_inc',
        emprf14='father_empl',
        occf14b='father_occ',
        emprm14='mother_empl',
        occm14b='mother_occ',
    ),
    inplace=True,
)
tyrimas.head()


# Keep only the Lithuanian respondents.
# .copy() makes df1 an independent DataFrame rather than a slice of `tyrimas`,
# so adding columns to it later does not raise SettingWithCopyWarning.
df1 = tyrimas[tyrimas['country'] == 'LT'].copy()


# Check how many respondents are in the sample
df1.shape

(1659, 24)


# By gender: 1 - male, 2 - female
sns.countplot(x='gender', data=df1, palette='rocket')
plt.show()


# Number of respondents per gender code
df1["gender"].value_counts()

gender
2    1021
1     638
Name: count, dtype: int64


# Raw ages are too scattered to chart cleanly, so they are bucketed into age groups
def age_group(age):
    """Map an age in years to its age-bracket label.

    Brackets are inclusive at the upper bound: '0-20', '21-35', '36-50',
    '51-65' and '66+'.

    A missing age (None or NaN) yields NaN instead of a label. Every
    comparison with NaN is False, so without this guard a missing age would
    fall through all the branches and be counted as '66+'.
    """
    # NaN is the only value that is not equal to itself.
    if age is None or age != age:
        return float('nan')
    if age <= 20:
        return '0-20'
    elif age <= 35:
        return '21-35'
    elif age <= 50:
        return '36-50'
    elif age <= 65:
        return '51-65'
    else:
        return '66+'

# Create a new column holding the age-group label computed from each age
df1['age group'] = df1['age'].apply(age_group)

C:\Users\viole\AppData\Local\Temp\ipykernel_8608\2823441412.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['age group'] = df1['age'].apply(age_group)


# Number of respondents in each age group
sns.countplot(x='age group', data=df1, palette='rocket')
plt.show()


# Number of respondents per age-group label
df1["age group"].value_counts()

age group
51-65    484
66+      409
36-50    387
21-35    304
0-20      75
Name: count, dtype: int64


# By activity: 1- Paid work, 2- Education, 3- Unemployed, looking for job, 4 - Unemployed, not looking for job, 5 - Permanently sick or disabled
# 6- Retired, 7- Community or military service, 8 - Housework, looking after children, others, 9- Other
# Although the data were cleaned, code 77 (refusal) still slipped in
sns.countplot(x='activity', data=df1, palette='rocket')
plt.show()


# By activity: 1- Paid work, 2- Education, 3- Unemployed, looking for job, 4 - Unemployed, not looking for job, 5 - Permanently sick or disabled
# 6- Retired, 7- Community or military service, 8 - Housework, looking after children, others, 9- Other
# Although the data were cleaned, code 77 (refusal) still slipped in
df1["activity"].value_counts()

activity
1.0     806
6.0     438
2.0      98
8.0      84
3.0      79
5.0      75
4.0      40
77.0     27
9.0      12
Name: count, dtype: int64


# Check how paid work (activity code 1) is distributed across the age groups
df2 = df1.loc[df1['activity'].eq(1)]


# Number of rows of df2 in each age group
sns.countplot(x='age group', data=df2, palette='rocket')
plt.show()


# The previous chart cannot be fully trusted because it shows absolute counts.
# The gap between two of the age groups is small, so percentages are needed to be sure.
df3 = df1.loc[df1['age group'] == "36-50"]


# Percentage of each activity code within the "36-50" group
# (denominator is the number of rows in df3)
total_count = df3.shape[0]
jaunesni = df3['activity'].value_counts()
jaunesni_proc = jaunesni.div(total_count).mul(100)
jaunesni_proc

activity
1.0     79.069767
8.0      6.201550
3.0      4.651163
5.0      3.100775
77.0     2.842377
4.0      2.583979
6.0      0.775194
9.0      0.775194
Name: count, dtype: float64


# Select the "51-65" group so its percentages can be computed
df4 = df1.loc[df1['age group'] == "51-65"]


# Percentage of each activity code within the "51-65" group
# (denominator is the number of rows in df4)
total_count = df4.shape[0]
vyresni = df4['activity'].value_counts()
vyresni_proc = vyresni.div(total_count).mul(100)
vyresni_proc

activity
1.0     58.264463
6.0     11.983471
5.0     10.537190
3.0      9.504132
4.0      4.132231
8.0      3.305785
9.0      1.446281
77.0     0.619835
2.0      0.206612
Name: count, dtype: float64


# Drop the stray "refusal" code (77) by keeping only activity codes below 77
df5 = df1.loc[df1['activity'].lt(77)]


# Age against activity code, coloured by gender: 1 - male, 2 - female
sns.jointplot(data=df5, x='age', y='activity', hue='gender')
plt.show()

C:\Users\viole\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\viole\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):


# Highest age per gender among the rows of df2
df2.groupby(["gender"])["age"].max()

gender
1    78.0
2    83.0
Name: age, dtype: float64


# The previous result is not enough to draw conclusions about the whole age
# group, so select everyone aged 66+ who is in paid work (activity code 1)
ilgiausiai_dirba = df1.loc[df1['age group'].eq("66+") & df1['activity'].eq(1)]


# Percentage of each gender among the selected rows
total_count = ilgiausiai_dirba.shape[0]
lytis = ilgiausiai_dirba['gender'].value_counts()
lytis_proc = lytis.div(total_count).mul(100)
lytis_proc

gender
1    52.941176
2    47.058824
Name: count, dtype: float64


# Keep only health ratings below 6, which drops the stray "refusal" code
df6 = df1.loc[df1['health'].lt(6)]


# Health rating scale: 1 Very good; 2 Good; 3 Fair; 4 Bad; 5 Very bad
sns.regplot(x='age', y='health', data=df6)
plt.show()


# Check how the mean health rating varies across the age groups
df6.groupby(["age group"])["health"].mean()

age group
0-20     1.800000
21-35    1.953947
36-50    2.222798
51-65    2.691511
66+      3.132353
Name: health, dtype: float64


# Flag the respondents in paid work: True where the activity code is 1
df1['dirbantys'] = df1['activity'].eq(1)

C:\Users\viole\AppData\Local\Temp\ipykernel_8608\3863111863.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['dirbantys'] = df1['activity'] == 1


# A numeric column is needed: convert the boolean flag to integers (True -> 1, False -> 0)
df1['dirba_binarinis'] = df1['dirbantys'].astype(int)
df1.head()

C:\Users\viole\AppData\Local\Temp\ipykernel_8608\1416501714.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['dirba_binarinis'] = df1['dirbantys'].astype(int)


# Based on research showing that employment declines from age 55, keep only
# respondents older than 55. Requiring eduyrs below 50 also removes the stray
# "refusal" codes.
df7 = df1.loc[df1['age'].gt(55) & df1['eduyrs'].lt(50)]


# Years of education against the 0/1 paid-work indicator
df7.plot.scatter(x='eduyrs', y='dirba_binarinis')
plt.show()


# Mašininio mokymosi dalis
from sklearn.linear_model import LogisticRegression


# Feature matrix: the single column `eduyrs`, kept as a DataFrame via the double brackets
X=df7[["eduyrs"]]
X.head()


# Target: the 0/1 paid-work indicator column
y=df7["dirba_binarinis"]
y.head()

idno
10024    0
10061    0
10076    0
10079    0
10090    0
Name: dirba_binarinis, dtype: int32


# Create a logistic regression classifier with default settings and display it
model = LogisticRegression()
model

LogisticRegression()

LogisticRegression()


from sklearn.model_selection import train_test_split


# Hold out 20% of the rows as a test set.
# random_state fixes the shuffle, so the split (and any accuracy computed from
# it) is the same on every run instead of changing each time the cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train


# Fit the classifier on the training split
model.fit(X_train, y_train)

LogisticRegression()

LogisticRegression()


# Predicted class for each row of the test split
model.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


# Model accuracy on the test split (about 70% in the recorded run)
model.score(X_test, y_test)

0.7089201877934272


from sklearn import metrics


# Repeat the experiment with a 30% test split: refit a fresh model, then print
# its accuracy and confusion matrix on the held-out rows.
# random_state pins the shuffle so the printed figures are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
model = LogisticRegression()
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)
print(model.score(X_test, y_test))
print(metrics.confusion_matrix(y_test, y_predicted))

0.7089201877934272
[[149   3]
 [ 59   2]]


# Confusion matrix drawn as a heatmap, each cell shown as a share of all test rows
cf_matrix = metrics.confusion_matrix(y_test, y_predicted)
sns.heatmap(cf_matrix / cf_matrix.sum(), annot=True, fmt='.2%', cmap='Blues')

<Axes: >


# Confusion matrix as a labelled table: rows are actual classes, columns are
# predicted classes ('nedirba' = not working, 'dirba' = working)
(
    pd.DataFrame(
        metrics.confusion_matrix(y_test, y_predicted),
        index=['nedirba', 'dirba'],
        columns=['nedirba', 'dirba'],
    )
    .rename_axis(index='Actual', columns='predicted')
)

idno	country	gender	age	health
10002	BG	2	76	2
10003	FI	2	24	1
10004	CH	1	57	2
10005	FR	2	23	2
10006	BG	1	43	2
10007	FR	2	32	2
10009	BG	2	50	3
10011	FR	2	25	2
10014	CH	2	33	1
10017	FI	2	33	4

	eduyrs
idno
23918	13.0
11989	15.0
12712	11.0
15430	8.0
24832	16.0
...	...
24227	14.0
10283	15.0
25616	11.0
24175	14.0
27152	12.0

Vyresnio amžiaus asmenų užimtumo veiksniai

1. Duomenų įkėlimas

1.1. Duomenų įkėlimas prisijungus prie MySQL serverio

1.2. Duomenų įkėlimas iš csv

2. Duomenų tvarkymas

3. Tyrimo imties charakteristikos

4. Tikrinama H1: Vyresnio amžiaus asmenų dalyvavimas darbo rinkoje yra mažesnis nei jaunesnių

5. Tikrinama H2: Užimtumo struktūroje vyrai dalyvauja ilgiau už moteris

6. Tikrinama H3: Vyresnio amžiaus asmenys yra linkę prasčiau vertinti savo sveikatą

7. Tikrinama H4: Aukštesnis išsilavinimas yra susijęs su aktyvesniu užimtumu vyresniame amžiuje.

8. Mašininis mokymasis

Išvados

	country	gender	age	health	marital	children	domicil	edu_isced	eduyrs	activity	...	hinctnta	hincfel	father_edu	emprf14	occf14b	mother_edu	emprm14	occm14b	course	region
idno
10038	BE	2	16.0	2	6.0	2.0	4.0	1.0	11.0	2.0	...	NaN	2.0	4.0	1.0	NaN	6.0	1.0	5.0	2.0	BE24
10053	BE	2	24.0	2	6.0	2.0	4.0	7.0	18.0	3.0	...	NaN	1.0	7.0	1.0	2.0	7.0	2.0	3.0	1.0	BE24
10055	BE	1	58.0	2	1.0	2.0	4.0	6.0	18.0	5.0	...	7.0	2.0	5.0	4.0	NaN	5.0	1.0	3.0	2.0	BE33
10062	BE	1	35.0	1	4.0	1.0	1.0	4.0	12.0	1.0	...	7.0	2.0	1.0	2.0	8.0	1.0	3.0	NaN	2.0	BE21
10064	BE	1	61.0	1	6.0	2.0	2.0	7.0	19.0	1.0	...	6.0	1.0	6.0	1.0	3.0	6.0	1.0	3.0	2.0	BE24

	country	gender	age	health	marital	children	area_live	edu_isced	eduyrs	activity	...	mother_edu	mother_empl	mother_occ	course	region	age group	ar dirba	ardirba	dirbantys	dirba_binarinis
idno
10018	LT	1	24.0	1	1.0	2.0	1.0	4.0	12.0	1.0	...	NaN	4.0	NaN	2.0	LT022	21-35	1	1	True	1
10024	LT	2	74.0	3	1.0	1.0	3.0	4.0	11.0	6.0	...	NaN	1.0	7.0	2.0	LT026	66+	2	2	False	0
10039	LT	2	37.0	2	6.0	NaN	3.0	5.0	13.0	1.0	...	5.0	1.0	5.0	2.0	LT027	36-50	1	1	True	1
10054	LT	2	49.0	1	4.0	2.0	4.0	6.0	14.0	1.0	...	6.0	1.0	7.0	2.0	LT022	36-50	1	1	True	1
10059	LT	1	54.0	4	1.0	2.0	3.0	3.0	11.0	1.0	...	4.0	1.0	1.0	2.0	LT029	51-65	1	1	True	1

predicted	nedirba	dirba
Actual
nedirba	149	3
dirba	59	2